# install.packages("tidyverse")
# install.packages("timetk")
library(tidyverse)
library(readr) #read_csv()
library(dplyr)
library(glmnet)
library(caret) # may not need anymore
library(timetk)
library(rsample)
# Load data:
df <- read_csv("./data/walmart/Walmart.csv")
head(df)
Fix Datatypes, Sort Values
# Fixing datatypes:
# any(grepl("/", df$Date))
print(sapply(df, class))
Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI Unemployment
"numeric" "character" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
df$Date <- gsub("/", "-", df$Date) # replace slashes with dashes, so all dates follow same format
df$Date <- as.Date(df$Date, format = "%d-%m-%Y") # NOTE european-like date: day/month/year
print(sapply(df, class))
Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI Unemployment
"numeric" "Date" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# Sort values:
df <- df[order(df$Date, decreasing = FALSE), ] # gets an ordering of rows based on date and then df[ix] to select that order
head(df)
print(dim(df))
[1] 6435 8
table(df$Date) # value_counts
2010-02-05 2010-02-12 2010-02-19 2010-02-26 2010-03-05 2010-03-12 2010-03-19 2010-03-26 2010-04-02 2010-04-09 2010-04-16 2010-04-23 2010-04-30 2010-05-07 2010-05-14 2010-05-21 2010-05-28 2010-06-04
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2010-06-11 2010-06-18 2010-06-25 2010-07-02 2010-07-09 2010-07-16 2010-07-23 2010-07-30 2010-08-06 2010-08-13 2010-08-20 2010-08-27 2010-09-03 2010-09-10 2010-09-17 2010-09-24 2010-10-01 2010-10-08
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2010-10-15 2010-10-22 2010-10-29 2010-11-05 2010-11-12 2010-11-19 2010-11-26 2010-12-03 2010-12-10 2010-12-17 2010-12-24 2010-12-31 2011-01-07 2011-01-14 2011-01-21 2011-01-28 2011-02-04 2011-02-11
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2011-02-18 2011-02-25 2011-03-04 2011-03-11 2011-03-18 2011-03-25 2011-04-01 2011-04-08 2011-04-15 2011-04-22 2011-04-29 2011-05-06 2011-05-13 2011-05-20 2011-05-27 2011-06-03 2011-06-10 2011-06-17
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2011-06-24 2011-07-01 2011-07-08 2011-07-15 2011-07-22 2011-07-29 2011-08-05 2011-08-12 2011-08-19 2011-08-26 2011-09-02 2011-09-09 2011-09-16 2011-09-23 2011-09-30 2011-10-07 2011-10-14 2011-10-21
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2011-10-28 2011-11-04 2011-11-11 2011-11-18 2011-11-25 2011-12-02 2011-12-09 2011-12-16 2011-12-23 2011-12-30 2012-01-06 2012-01-13 2012-01-20 2012-01-27 2012-02-03 2012-02-10 2012-02-17 2012-02-24
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2012-03-02 2012-03-09 2012-03-16 2012-03-23 2012-03-30 2012-04-06 2012-04-13 2012-04-20 2012-04-27 2012-05-04 2012-05-11 2012-05-18 2012-05-25 2012-06-01 2012-06-08 2012-06-15 2012-06-22 2012-06-29
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
2012-07-06 2012-07-13 2012-07-20 2012-07-27 2012-08-03 2012-08-10 2012-08-17 2012-08-24 2012-08-31 2012-09-07 2012-09-14 2012-09-21 2012-09-28 2012-10-05 2012-10-12 2012-10-19 2012-10-26
45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45
There are 45 stores, so this makes sense, that there are 45 entries for each date. Also, each date represents one week.
Normalize weekly sales within each store, so that no one store dominates, and weekly sales are comparable: i.e. (sales_store - mean(sales_store)) / std(sales_store) and can convert back later via (scaled_sales * store_std) + store_mean
df <- df %>%
group_by(Store) %>%
mutate(
sales_mean = mean(Weekly_Sales, na.rm = TRUE),
sales_std = sd(Weekly_Sales, na.rm = TRUE),
weekly_sales_scaled = (Weekly_Sales - sales_mean) / sales_std
) %>%
ungroup()
head(df)
for (store_id in 1:length(unique(df$Store))) {
p <- df %>%
filter(Store == store_id) %>%
ggplot(aes(x = Date, y = weekly_sales_scaled)) +
geom_line(color = "steelblue") +
labs(title = paste("Weekly Sales Over Time - Store", store_id),
x = "Date",
y = "Weekly Sales (scaled per store)")
print(p)
}
NA
NA
# (Can't randomize the data first, because it is time-series)
# Train/Test Split:
split_ix <- floor(.8*nrow(df))
train_df <- df[1:split_ix,]
test_df <- df[(split_ix+1):nrow(df),]
print(dim(train_df))
[1] 5148 11
print(dim(test_df))
[1] 1287 11
cat("train:", format(min(train_df$Date), "%Y-%m-%d"), " to ", format(max(train_df$Date), "%Y-%m-%d"),"\n")
train: 2010-02-05 to 2012-04-13
cat("test:", format(min(test_df$Date), "%Y-%m-%d"), " to ", format(max(test_df$Date), "%Y-%m-%d"),"\n")
test: 2012-04-13 to 2012-10-26
print(colnames(df))
[1] "Store" "Date" "Weekly_Sales" "Holiday_Flag" "Temperature" "Fuel_Price" "CPI" "Unemployment"
[9] "sales_mean" "sales_std" "weekly_sales_scaled"
# Define design matrix pattern (to make it easier to form X's during cross val):
design_matrix_formula <- weekly_sales_scaled ~ . - Store - Date - Weekly_Sales - sales_mean - sales_std + 0 # 0 means no intercept, glmnet will add its own; here we drop columns
# model.matrix creates a design matrix from formula, auto-encodes categorical as dummy vars (if any--in this case, holiday is already a double.)
X_test <- model.matrix(design_matrix_formula, data=test_df)
y_test <- test_df$weekly_sales_scaled
print(dim(X_test))
[1] 1287 5
print(X_test[1:10,]) # see example of columns left for prediction
Holiday_Flag Temperature Fuel_Price CPI Unemployment
1 0 44.42 4.187 137.8680 8.150
2 0 45.68 4.044 214.3127 7.139
3 0 69.03 3.891 221.1484 6.891
4 0 49.89 4.025 141.8434 7.671
5 0 41.81 4.025 137.8680 4.125
6 0 48.65 4.187 137.8680 8.983
7 0 42.46 4.044 214.3127 7.139
8 0 36.90 4.025 137.8680 7.489
9 0 52.22 4.187 141.8434 8.253
10 0 64.28 4.254 131.1080 11.627
Because, it is important to keep the future in the test, or else it could cheat and accidentally learn the past and use it to predict the past, some inherent pattern. i.e. could use info from the future to predict the past (leakage).
# NOTE - ctrl shift c to mass uncomment/comment out
# Attempt 1 - shorter
# cv_folds <- time_series_cv(
# data = train_df,
# date_var = Date,
# cumulative = FALSE, # no growing window (same size folds)
# initial = "6 months", # length of train window
# assess = "3 months", # length of validation window
# skip = "3 months", # jump between folds
# slice_limit = 5 # num folds max
# )
# Attempt 2-- longer, cumulative windowing
cv_folds <- time_series_cv(
data = train_df,
date_var = Date,
cumulative = TRUE, # growing window (same size folds)
initial = "1 year", # length of train window
assess = "3 months", # length of validation window
skip = "3 months", # jump between folds
slice_limit = 5 # num folds max
)
plot_time_series_cv_plan(cv_folds, .date_var = Date, .value = weekly_sales_scaled, .interactive=TRUE)
# === Define Lambda search grid ===
# Easy way to get lambda grid with good scaling--from the glmnet function itself.
# Note that we will not keep this model fit, this is solely for the purpose of getting
# a lambda grid for our manual CV.
unused_X_all <- model.matrix(design_matrix_formula, data=train_df)
unused_y_all <- train_df$weekly_sales_scaled
unused_model <- glmnet(unused_X_all, unused_y_all, alpha=1)
lambda_grid <- unused_model$lambda
lambda_grid
[1] 0.1450961864 0.1322062411 0.1204614030 0.1097599440 0.1000091731 0.0911246338 0.0830293725 0.0756532718 0.0689324437 0.0628086754 0.0572289258 0.0521448657 0.0475124596 0.0432915836 0.0394456786
[16] 0.0359414333 0.0327484954 0.0298392093 0.0271883762 0.0247730358 0.0225722676 0.0205670095 0.0187398931 0.0170750926 0.0155581885 0.0141760419 0.0129166814 0.0117691990 0.0107236558 0.0097709958
[31] 0.0089029675 0.0081120524 0.0073914000 0.0067347684 0.0061364701 0.0055913230 0.0050946053 0.0046420146 0.0042296308 0.0038538821 0.0035115138 0.0031995606 0.0029153205 0.0026563314 0.0024203503
[46] 0.0022053330 0.0020094173 0.0018309062 0.0016682536 0.0015200505 0.0013850134 0.0012619726 0.0011498625 0.0010477119 0.0009546360 0.0008698288 0.0007925556 0.0007221471 0.0006579936 0.0005995392
[61] 0.0005462778 0.0004977480 0.0004535294 0.0004132391
n_folds <- length(cv_folds$splits)
n_lambdas <- length(lambda_grid)
cv_mses <- numeric(n_lambdas) # keep track of MSE across different lambdas
lambda_ix <- 1
for (lambda in lambda_grid) {
fold_mses <- numeric(n_folds) # array for keeping track of mse across folds
for (fold_ix in 1:n_folds) {
# Get train, valid data for current cv fold:
fold <- cv_folds$splits[[fold_ix]]
train_data <- analysis(fold)
valid_data <- assessment(fold)
# Manipulate data into design matrices:
X_train <- model.matrix(design_matrix_formula, data=train_data)
y_train <- train_data$weekly_sales_scaled
X_val <- model.matrix(design_matrix_formula, data=valid_data)
y_val <- valid_data$weekly_sales_scaled
# Fit LASSO (non-cv version):
lasso_model <- glmnet(X_train, y_train, alpha=1, lambda=lambda)
# Get val error:
yhat_val <- predict(lasso_model, newx=X_val, s=lambda)
fold_mses[fold_ix] <- mean((y_val - yhat_val)^2)
}
cv_mses[lambda_ix] <- mean(fold_mses)
lambda_ix <- lambda_ix + 1
}
best_lambda <- lambda_grid[which.min(cv_mses)]
cat("best lambda: ", best_lambda, "with min mse ", min(cv_mses), "\n\n")
best lambda: 0.06893244 with min mse 1.000508
plot(x=lambda_grid,
y=cv_mses,
main = paste0(n_folds, "-Fold Cross-Validation MSE for different Lambdas"),
xlab = "Lambda",
ylab = "Mean CV MSE")
#best_ix <- which.min(cv_mses)
#points(best_lambda, cv_mses[best_ix], col = "red", pch = 19, cex = 1.5)
#text(best_lambda, cv_mses[best_ix],
# labels = paste("Best λ =", round(best_lambda, 4)),
# pos = 4, col = "red")
X_train <- model.matrix(design_matrix_formula, data=train_df)
y_train <- train_df$weekly_sales_scaled
best_lasso_model <- glmnet(X_train, y_train, alpha=1, lambda=best_lambda)
# Get val error:
yhat_test <- predict(best_lasso_model, newx=X_test, s=best_lambda)
mse_test <- mean((y_test - yhat_test)^2)
cat("Test MSE: ", mse_test, "\n\n")
Test MSE: 0.4069579
coef(best_lasso_model)
6 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 0.078758823
Holiday_Flag 0.262235923
Temperature -0.001764706
Fuel_Price .
CPI .
Unemployment .
First CV param attempt notes: The above serves as a baseline, when using no time-averaging or time-lagged features. It could just predict the mean, not finding a strong signal with the other features. Next, we will need to try incorporating time. In the meantime, some things I will try are changing the folds and lambdas used.
second CV param attempt notes: Second attempt, used 1 year initial, and cumulative folds. Now holidays and temperature matter, which is better than the first attempt, which had 0’s for all features and intercept was the only kept feature. MSE stayed the same.
LASSO - baseline, NO lagged variables.
Lasso Path–shows variables entering and leaving as lambda (regularization) increases. (NOT cross val).
fit_path <- glmnet(X_train, y_train, alpha = 1) # doesnt take best lambda, fits a bunch of lambdas, no CV
plot(fit_path, xvar = "lambda", label = TRUE)
title("LASSO Coefficient Paths")
beta_mat <- as.matrix(fit_path$beta)
#rownames(beta_mat)
variable_map <- data.frame(
index = seq_len(nrow(beta_mat)),
variable = rownames(beta_mat)
)
print(variable_map)
# 1) Extract coefficients for each tested lambda value, and convert datatype to R matrix:
Beta_lambda <- as.matrix(coef(fit_path, s=fit_path$lambda))
# 2) Even tho I didn't specify intercept, there is a intercept row of 0's.
#Beta_lambda <- Beta_lambda[-1,, drop=FALSE] # all rows except row 1, all columns; dont change dimensions
# print(Beta_lambda)
cat('Beta_lambda shape:', dim(Beta_lambda), '\n') # shape: (p x #lambdas
Beta_lambda shape: 6 64
# 3) Get L0, L1 norms:
l1_norm = colSums(abs(Beta_lambda)) # sum up each betah_lambda col; colSums computes sum of each column
l0_norm = colSums(Beta_lambda != 0) # number of nonzero coeffs for each lambda
# 4) Want the plot's x axis to be increasing model complexity (incr L1 norm to the right)
incr_order_ixs = order(l1_norm) # returns indices to make it incr order
l1_norm_ordered = l1_norm[incr_order_ixs]
l0_norm_ordered = l0_norm[incr_order_ixs]
Beta_lambda_ordered = Beta_lambda[,incr_order_ixs,drop=FALSE]
# 5) Plot coeff vs. L1 norm:
# note: currently, Beta_lambda's columns are for each lambda, but matplot plots columns of y
# as separate lines. We want a line for each coeff (p). Beta_lambda is (p x #lambdas).
# so, we take the transpose of beta.
# also, num rows should match for x (#lambdas x 1) and y (p x #lambdas).
cat('L1 norm shape:', length(l1_norm_ordered), '\n')
L1 norm shape: 64
cat('Beta_lambda shape:', dim(Beta_lambda_ordered), '\n')
Beta_lambda shape: 6 64
matplot(x=l1_norm, y=t(Beta_lambda),
type="l",# plot type: line
lwd=2, # line width
lty=1, # line type (solid, dashed)
xlab=expression("L1 Norm (" * "||" * hat(beta[lambda]) * "||"[1] * ")"),
ylab="Coefficients",
main="Lasso Path")
abline(h = 0, col = "black", lwd = 1, lty = 2) # show x axis more easily